Exploring Different Methods of Clustering with Embedding Vectors¶
import openai
import dotenv
import os
import numpy as np
from typing import List
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import warnings
from sklearn.cluster import DBSCAN
import umap
import plotly.express as px
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import plotly.offline as pyo
#import plotly.io as pio
#pio.renderers.default='iframe'
# Silence library warnings (umap/sklearn are chatty) for cleaner notebook output.
warnings.filterwarnings("ignore")
# Load variables from a local .env file, overriding any already-set env vars.
dotenv.load_dotenv(override=True)
# Read the OpenAI API key from the environment and register it with the client.
open_ai_token = os.getenv("OPEN_AI_API_KEY")
openai.api_key = open_ai_token
def get_embeddings(text_list:list, model:str = 'text-embedding-ada-002')->List[np.ndarray]:
    '''
    Returns a list of embedding vectors from the OpenAI API as NumPy arrays.
    Args:
        text_list (list): A list of strings for which embeddings are to be generated.
        model (str, optional): The model to use for generating embeddings. Defaults to 'text-embedding-ada-002'.
    Returns:
        List[np.ndarray]: A list of NumPy arrays where each array is an embedding vector corresponding to the input text.
    '''
    # Bug fix: the `model` argument was previously ignored — the model name was
    # hard-coded in the API call, so passing a different model had no effect.
    response = openai.embeddings.create(input = text_list, model = model)
    embeddings = [np.array(li.embedding) for li in response.data]
    return embeddings
get_embeddings(['dog'])
[array([-0.00348209, -0.01784995, -0.01628132, ..., -0.006332 ,
0.00458419, -0.01899938])]
# Ground-truth taxonomy: 10 ML terms per class. The two keys serve as the
# true labels when scoring the clustering results below.
terms = {
    "Supervised Learning": [
        "Linear Regression",
        "Logistic Regression",
        "Support Vector Machine",
        "Decision Tree",
        "Random Forest",
        "Gradient Boosting",
        "Neural Network",
        "K-Nearest Neighbors",
        "Naive Bayes",
        "Ridge Regression"
    ],
    "Unsupervised Learning": [
        "K-Means Clustering",
        "Hierarchical Clustering",
        "Principal Component Analysis",
        "Autoencoder",
        "t-SNE",
        "DBSCAN",
        "Gaussian Mixture Model",
        "Latent Dirichlet Allocation",
        "Independent Component Analysis",
        "Self-Organizing Map"
    ]
}
# Flatten the dict values into one ordered list of 20 terms (dict order preserved).
term_list = [term for val in terms.values() for term in val]
# Embed every term via the OpenAI API — one 1536-dim vector per term.
term_embeddings = get_embeddings(term_list)
# Sanity check: one embedding per term (cell output below).
len(term_embeddings), len(term_list)
(20, 20)
def plot_terms_with_umap(embeddings:list, terms:list, class_list:list, title:str):
    '''
    Plots the given terms in 2D using UMAP dimensionality reduction and Plotly.
    Args:
        embeddings (list): Embedding vectors, one per term.
        terms (list): Term labels, aligned index-for-index with `embeddings`.
        class_list (list): Class/cluster label per term, used for point color.
        title (str): Title for the plot.
    Returns:
        None: Displays a 2D Plotly scatter plot of the terms.
    '''
    # Bug fix: the original ignored its `embeddings` and `terms` parameters —
    # it recomputed embeddings with get_embeddings(term_list) (a module-level
    # global) on every call, paying for an extra API round-trip, and labeled
    # the points from the global term_list instead of the `terms` argument.
    reducer = umap.UMAP(n_components=2, random_state=42)
    embedding_2d = reducer.fit_transform(embeddings)
    fig = px.scatter(
        x=embedding_2d[:, 0],
        y=embedding_2d[:, 1],
        text=terms,
        color=class_list,
        labels={'color': 'Class'},
        title=title
    )
    fig.update_traces(marker=dict(size=8), textposition='top center')
    fig.update_layout(autosize=False, width=1200, height=800)
    fig.show()
# Ground-truth class label per term: each dict key repeated once per member
# term, aligned with term_list's ordering.
class_list = [key for key, val in terms.items() for _ in val]
plot_terms_with_umap(term_embeddings, term_list, class_list, title='UMAP Dimensionality Reduction of ML Terms')
Spectral Clustering¶
Spectral Clustering is based on graph theory. It is particularly effective for clustering non-linearly separable data or data with complex structures. There are four major steps this algorithm completes. First, it constructs a similarity graph with a distance metric such as k-nearest neighbors. Then it computes a Laplacian matrix, which encodes information about the graph structure. Third, eigenvalue decomposition is applied, and fourth, a clustering algorithm is run in the resulting lower-dimensional space.
Advantages¶
Handles Complex Structures: Spectral clustering is particularly effective for identifying clusters with non-convex shapes or varying densities, which are challenging for traditional clustering methods. No Need for Initial Centroids: Unlike K-Means, it doesn't require the specification of initial centroids, making it less sensitive to initial conditions.
Disadvantages¶
Computationally Intensive: Spectral clustering requires the computation of the similarity matrix and the eigenvalue decomposition, which can be computationally expensive, especially for large datasets. The performance of spectral clustering depends on the choice of the similarity measure, the number of neighbors (in k-nearest neighbors graph), and the number of clusters.
from sklearn.cluster import SpectralClustering
def spectral_clustering(evs, n_clusters: int, affinity: str = 'nearest_neighbors', assign_labels: str = 'kmeans') -> np.ndarray:
    """
    Cluster the given embedding vectors with spectral clustering.
    Args:
        evs (Union[List[List[float]], np.ndarray]): The embedding vectors to cluster.
        n_clusters (int): The number of clusters to form.
        affinity (str): How to construct the affinity matrix. Options include 'nearest_neighbors', 'rbf', etc.
        assign_labels (str): The strategy to use to assign labels. Options include 'kmeans', 'discretize'.
    Returns:
        np.ndarray: Array of cluster labels assigned to each data point.
    """
    # random_state pins the embedding/label-assignment randomness for reproducibility.
    model = SpectralClustering(
        n_clusters=n_clusters,
        affinity=affinity,
        assign_labels=assign_labels,
        random_state=42,
    )
    return model.fit_predict(np.array(evs))
# Cluster the embeddings into two groups and overlay the predicted labels on the UMAP plot.
spectral_class_yhat = spectral_clustering(term_embeddings, n_clusters=2)
plot_terms_with_umap(term_embeddings, term_list, spectral_class_yhat, title='UMAP Dimensionality Reduction of ML Terms w/ Spectral Clustering')
Scoring Metrics:¶
Adjusted Rand Index (ARI):¶
Measures the similarity between two data clusterings, considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings. Adjusted for chance, with a range of -1 to 1, where 1 indicates perfect clustering.
Normalized Mutual Information (NMI):¶
Measures the amount of information obtained about one clustering from the other clustering. Ranges from 0 to 1, where 1 indicates perfect clustering.
# Score the predicted clusters against the known two-class ground truth.
ari_score = adjusted_rand_score(class_list, spectral_class_yhat)
print(ari_score)
nmi_score = normalized_mutual_info_score(class_list, spectral_class_yhat)
print(nmi_score)
0.7995558023320377 0.7610260716417302
DBSCAN (Density-Based Spatial Clustering of Applications with Noise)¶
DBSCAN is a popular clustering algorithm that groups together points that are closely packed and marks points that are in low-density regions (e.g., points that are outliers).
How DBSCAN Works¶
- Core Points: Points that have a minimum number of neighbors within a specified radius are considered core points.
- Density Reachable: Points that are within the specified radius of a core point are considered density-reachable and belong to the same cluster.
- Noise Points: Points that are neither core points nor reachable from any core points are classified as noise or outliers.
Advantages¶
- No Need for Pre-Defined Clusters: Unlike K-Means, DBSCAN doesn't require specifying the number of clusters beforehand.
- Handles Arbitrary Shapes: It can find clusters of arbitrary shapes, making it ideal for datasets with clusters that aren't well-separated or spherical.
- Robust to Outliers: DBSCAN effectively identifies and handles outliers as noise points.
Disadvantages¶
- Sensitive to Parameters: The performance of DBSCAN depends on the choice of the radius (eps) and the minimum number of points (min_samples). Poor choices can lead to poor clustering results.
- Difficulty with Varying Densities: DBSCAN may struggle with datasets where clusters have varying densities, as it relies on a fixed radius parameter.
Parameter Tuning¶
- Start with smaller versions of EPS and Min Samples and gradually increase both.
- To explore more in the future: automate parameter tuning, and look at the silhouette score.
def dbscan_clustering(evs, eps: float = 0.5, min_samples: int = 1) -> np.ndarray:
    """
    Run DBSCAN density-based clustering on the given data.
    Args:
        evs: The data to cluster.
        eps (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
        min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
    Returns:
        np.ndarray: Array of cluster labels assigned to each data point
            (DBSCAN labels noise points as -1).
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    return model.fit_predict(np.array(evs))
# Run DBSCAN with hand-tuned eps; min_samples=1 means every point becomes a core
# point, so no points are labeled noise.
dbscan_class_yhat = dbscan_clustering(term_embeddings, eps=0.57, min_samples=1)
plot_terms_with_umap(term_embeddings, term_list, dbscan_class_yhat, title='UMAP Dimensionality Reduction of ML Terms w/ DBSCAN Clustering')
# Score against the known two-class ground truth.
ari_score = adjusted_rand_score(class_list, dbscan_class_yhat)
print(f'ARI Score for DBSCAN: {ari_score}')
nmi_score = normalized_mutual_info_score(class_list, dbscan_class_yhat)
print(f'NMI Score for DBSCAN: {nmi_score}')
ARI Score for DBSCAN: 0.0446927374301676 NMI Score for DBSCAN: 0.2762205461906752
Agglomerative Clustering (Hierarchical Clustering)¶
Agglomerative Clustering is a bottom-up hierarchical clustering method that builds nested clusters by successively merging the closest pair of clusters based on a predefined distance metric (the top-down counterpart, which splits clusters, is called divisive clustering).
How Agglomerative Clustering Works¶
- Start with Individual Points: Initially, each data point is considered as its own cluster.
- Merge Clusters Iteratively: At each step, the two clusters that are closest to each other (according to a chosen distance metric) are merged.
- Continue Until All Points Are in a Single Cluster: This process continues until all data points are grouped into a single cluster, or a predefined number of clusters is reached.
Linkage Criteria¶
Agglomerative clustering uses different linkage criteria to determine the distance between clusters:
- Single Linkage: Distance between the closest points of the clusters.
- Complete Linkage: Distance between the farthest points of the clusters.
- Average Linkage: Average distance between all points in the clusters.
- Ward’s Linkage: Minimizes the variance within clusters when merging.
Advantages¶
- No Need to Specify the Number of Clusters: Unlike K-Means, Agglomerative Clustering does not require you to predefine the number of clusters.
- Hierarchical Structure: Provides a dendrogram (a tree-like diagram) that shows the hierarchy of clusters and allows for flexible cluster selection by cutting the tree at different levels.
- Effective for Various Data Structures: Can work well with data that has a nested or hierarchical structure.
Disadvantages¶
- Computationally Intensive: As the dataset grows, the computation becomes more expensive, especially for large datasets, due to the need to compute pairwise distances.
- Sensitive to Noise and Outliers: Outliers can significantly affect the clustering process, especially in certain linkage methods like single linkage.
- Choice of Linkage Criteria: The clustering result can vary significantly depending on the chosen linkage criterion, and selecting the appropriate one can be challenging.
def agglomerative_clustering(evs, n_clusters: int, metric: str = 'cosine', linkage: str = 'average') -> np.ndarray:
    """
    Run agglomerative (hierarchical, bottom-up) clustering on the given data.
    Args:
        evs: The data to cluster.
        n_clusters (int): The number of clusters to form.
        metric (str): The distance metric to use for clustering. Common options include 'cosine', 'euclidean', etc.
        linkage (str): The linkage criterion to use. Options include 'ward', 'complete', 'average', and 'single'.
    Returns:
        np.ndarray: Array of cluster labels assigned to each data point.
    """
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        metric=metric,
        linkage=linkage,
    )
    return model.fit_predict(np.array(evs))
# Cluster into two groups with the default cosine/average-linkage settings and plot.
agglo_class_yhat = agglomerative_clustering(term_embeddings, n_clusters=2)
plot_terms_with_umap(term_embeddings, term_list, agglo_class_yhat, title='UMAP Dimensionality Reduction of ML Terms w/ Agglomerative Clustering')
# Score against the known two-class ground truth.
ari_score = adjusted_rand_score(class_list, agglo_class_yhat)
print(f'ARI Score for Agglomerative Clustering: {ari_score}')
nmi_score = normalized_mutual_info_score(class_list, agglo_class_yhat)
print(f'NMI Score for Agglomerative Clustering: {nmi_score}')
ARI Score for Agglomerative Clustering: 0.02145922746781116 NMI Score for Agglomerative Clustering: 0.14708219223672442
Dendrograms & Agglomerative Clustering¶
One benefit here is the ability to build a hierarchical plot of clusters, this could be useful in different use cases, and should be explored further.
def agglomerative_clustering(evs, n_clusters, linkage_method, metric='cosine',):
    """
    Run agglomerative clustering with an explicit (required) linkage method.

    NOTE: this redefines — and shadows — the earlier agglomerative_clustering,
    with linkage as a required positional argument instead of a keyword default.

    Args:
        evs: The data to cluster.
        n_clusters: The number of clusters to form.
        linkage_method: Linkage criterion ('ward', 'complete', 'average', 'single').
        metric: Distance metric; defaults to 'cosine'.
    Returns:
        np.ndarray: Cluster label per data point.
    """
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage=linkage_method,
        metric=metric,
    )
    return model.fit_predict(evs)
def plot_dendrogram(evs, method):
    """
    Display a dendrogram of the hierarchical linkage of the given vectors.

    Args:
        evs: The data to link hierarchically.
        method: SciPy linkage method ('single', 'complete', 'average', 'ward', ...).
    Returns:
        None: Shows a matplotlib figure.
    """
    linkage_matrix = linkage(evs, method=method)
    plt.figure(figsize=(15, 5))
    dendrogram(linkage_matrix)
    plt.show()
# Visualize the single-linkage hierarchy as a dendrogram.
plot_dendrogram(term_embeddings, method='single')
# Cut the single-linkage tree into two clusters.
cluster_labels = agglomerative_clustering(term_embeddings, n_clusters=2, linkage_method='single')
# Internal (label-free) quality metrics.
sil_score = silhouette_score(term_embeddings, cluster_labels)
ch_score = calinski_harabasz_score(term_embeddings, cluster_labels)
db_score = davies_bouldin_score(term_embeddings, cluster_labels)
print(f'Silhouette Score: {sil_score}')
print(f'Calinski-Harabasz Score: {ch_score}')
print(f'Davies-Bouldin Score: {db_score}')
# External metrics against the known two-class ground truth.
ari_score = adjusted_rand_score(class_list, cluster_labels)
print(f'ARI Score for Agglomerative Clustering: {ari_score}')
nmi_score = normalized_mutual_info_score(class_list, cluster_labels)
print(f'NMI Score for Agglomerative Clustering: {nmi_score}')
# Show the raw cluster assignments (cell output below).
cluster_labels
Silhouette Score: 0.10318005466766267 Calinski-Harabasz Score: 2.1112809948284372 Davies-Bouldin Score: 1.3815156998170752 ARI Score for Agglomerative Clustering: 0.02145922746781116 NMI Score for Agglomerative Clustering: 0.14708219223672442
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
dtype=int64)
KMeans¶
This is a well understood clustering model, no need to elaborate.
def kmeans_clustering(evs, n_clusters, init='k-means++', max_iter=300):
    """
    Run K-Means clustering on the given data.

    Args:
        evs: The data to cluster.
        n_clusters: The number of clusters (k) to form.
        init: Centroid initialization strategy; defaults to 'k-means++'.
        max_iter: Maximum number of iterations per run.
    Returns:
        np.ndarray: Cluster label per data point.
    """
    # random_state pins the centroid initialization for reproducibility.
    model = KMeans(
        n_clusters=n_clusters,
        init=init,
        max_iter=max_iter,
        random_state=42,
    )
    return model.fit_predict(np.array(evs))
# Cluster into two groups with K-Means and overlay the predicted labels on the UMAP plot.
kmeans_class_yhat = kmeans_clustering(term_embeddings, n_clusters=2)
plot_terms_with_umap(term_embeddings, term_list, kmeans_class_yhat, title='UMAP Dimensionality Reduction of ML Terms w/ KMeans Clustering')
# Score against the known two-class ground truth.
ari_score = adjusted_rand_score(class_list, kmeans_class_yhat)
print(f'ARI Score for KMeans: {ari_score}')
nmi_score = normalized_mutual_info_score(class_list, kmeans_class_yhat)
print(f'NMI Score for KMeans: {nmi_score}')
ARI Score for KMeans: 0.0 NMI Score for KMeans: 0.08068918390116703
!pip install plotly==5.24.1
Requirement already satisfied: plotly==5.24.1 in c:\users\kmsta\anaconda3\lib\site-packages (5.24.1) Requirement already satisfied: tenacity>=6.2.0 in c:\users\kmsta\anaconda3\lib\site-packages (from plotly==5.24.1) (8.2.3) Requirement already satisfied: packaging in c:\users\kmsta\anaconda3\lib\site-packages (from plotly==5.24.1) (24.1)
!jupyter nbconvert --to html clustering_word_ev_explore.ipynb
[NbConvertApp] Converting notebook clustering_word_ev_explore.ipynb to html [NbConvertApp] WARNING | Alternative text is missing on 1 image(s). [NbConvertApp] Writing 4958942 bytes to clustering_word_ev_explore.html